global r4data "/.../data"
global temp "/.../data/temp"
global output "/.../output"
set more off
set linesize 255

*-------------------------------------------------------------------------------
* Author: Dhiren Patki
*
* This program prepares a file that aggregates individual level data into 
* group - firmid - year - gender - age - tenure @ c-5 - state cells
* Regressions are subsequently run at the cell level
*-------------------------------------------------------------------------------

*-------------------------------------------------------------------------------
*Create cell-level file
*-------------------------------------------------------------------------------

*Read in worker file created in 04_toLEHD_ee.sas 

use $r4data/ee_panel_fin.dta, clear 
destring race educ_c, force replace
rename POB pob 
rename DOB dob  

*Check for panel balance
bysort group pik: gen pikrep = _N
tab pikrep

*Merge with CPI figures and make earnings real
merge m:1 year using $r4data/bls_cpi
keep if _merge==3
replace total_earn = total_earn*annual_cpi
replace high_earn = high_earn*annual_cpi
drop _merge

*Fill out individual characteristics
encode sex, gen(sex_enc)
bysort pik: egen temp = max(sex_enc)
replace sex_enc = temp if sex_enc==.
drop sex temp

bysort pik: egen temp = max(race)
replace race = temp if race==.
drop temp

bysort pik: egen temp = max(educ_c)
replace educ_c = temp if educ_c==.
drop temp 

encode ethnicity, gen(eth_enc)
bysort pik: egen temp = max(eth_enc)
replace eth_enc = temp if eth_enc==.
drop temp ethnicity

*Education
gen no_hsd = educ_c==/*redacted*/
gen hsd = educ_c==/*redacted*/
gen some_coll = educ_c==/*redacted*/
gen coll_plus = educ_c==/*redacted*/

*Race
gen white = /*redacted*/ //non-hispanic white
gen black = /*redacted*/ //non-hispanic black
gen asian = /*redacted*/ //non-hispanic asian
gen otrace = /*redacted*/  //non-hispanic otrace

*Hispanic ethnicity
gen hispanic= /*redacted*/

*Native born
gen nat_born =  (pob==/*redacted*/)
bysort pik: egen temp = max(nat_born)
replace nat_born= temp if pob==""
drop temp

*Average earnings prior to sample entry (-infty,c-5)
bysort group pik: egen av_tearn_pre = mean(total_earn) if year<group-5
bysort group pik: egen temp = max(av_tearn_pre)
replace av_tearn_pre = temp if av_tearn_pre==.
drop temp

*Average earnings growth 
gen logearn = ln(total_earn)
gen earn_gwth = .
sort group pik year
//prior to sample entry (-infty,c-5]
replace earn_gwth = logearn-logearn[_n-1] if group==group[_n-1] & pik==pik[_n-1]
bysort group pik: egen av_erngwth_pre = mean(earn_gwth) if year<=group-5
bysort group pik: egen temp = max(av_erngwth_pre)
replace av_erngwth_pre = temp if av_erngwth_pre==.
drop temp

*LFP status
gen lfp=/*redacted*/ //set this variable post start of tenure with DB sponsor
replace lfp=/*redacted*/ if total_earn!=. 

*Retirement
sort group pik year
*last year of paid work
bysort group pik: egen last_pay_yr = max(year) if lfp==/*redacted*/
bysort group pik: egen temp = max(last_pay_yr)
replace last_pay_yr = temp if last_pay_yr==.
drop temp

gen R=0
replace R=1 if year>last_pay_yr

*Age of retirement
bysort group pik: egen age_ret = min(age) if R==/*redacted*/
bysort group pik: egen temp = max(age_ret)
replace age_ret = temp if age_ret==.
drop temp

*Separation from DB sponsoring firm (last year working at DB sponsoring firm)
sort group pik year
replace fas_firm_id = fas_firm_id[_n-1] if fas_ein==fas_ein[_n-1] & pik==pik[_n-1] & year==2014 
replace tenure_alpha = tenure_alpha[_n-1] + 1 if fas_ein==fas_ein[_n-1] & pik==pik[_n-1] & year==2014
gen sep_ind = (firmid!=fas_firm_id[_n+1] & pik==pik[_n+1] & group==group[_n+1] & tenure_alpha!=.)
gen sep_year = sep_ind*year
replace sep_year=. if sep_year==/*redacted*/
bysort pik group: egen first_sep_yr = min(sep_year)
gen ee_sep = (first_sep_yr==sep_year & sep_year!=.) //first separation is the DB separation 
drop sep_year sep_ind first_sep_yr

*Keep obs in year>=group-5
keep if year>=group-5

*Save a skinny file for disclosure analysis
preserve
	keep group pik year sex_enc age_at_c source_at_cm5 no_hsd hsd some_coll coll_plus white black asian hispanic ///
        otrace nat_born R ee_sep lfp firmid fas_firm_id sein source tenure_cm5 age_ret
	save $r4data/disc_piklevel, replace
restore

*Collapse to group - firmid - year - gender - age - tenure @ c-5 - state level bins (not including earnings)
preserve
	gen t = 1 
	collapse (mean) lfp R age_ret ee_sep age cont_tenure_alpha av_tearn_pre av_erngwth_pre ///
	no_hsd hsd some_coll coll_plus white black asian otrace hispanic (sum) numpik=t, ///
	by(group firmid year sex_enc age_at_c tenure_cm5 source_at_cm5) //source_at_cm5 is the geographic state in c-5
	save $r4data/ee_panel_cell1, replace
restore

*Collapse to group - firmid - year - gender - age - tenure @ c-5 - state level bins (only earnings)
preserve
	collapse (mean) total_earn, ///
	by(group firmid year sex_enc age_at_c tenure_cm5 source_at_cm5) cw //cw ensures that missings are not included in the count
	save $r4data/ee_panel_cell2, replace
restore

use $r4data/ee_panel_cell1, clear
merge 1:1 group firmid year sex_enc age_at_c tenure_cm5 source_at_cm5 using $r4data/ee_panel_cell2
assert _merge==1 if lfp==0 
assert _merge!=2
drop _merge

*Create cell level outcomes and controls
gen logearn = ln(total_earn)
gen log_earnpc5 = ln(av_tearn_pre)

*Merge with firm level chars
merge m:1 group firmid year using $r4data/firmpanel_LEHD_m 

tab abv_cov_cut if _merge==2
drop if _merge==2

gen flagmerge= (_merge==1 & year<=group) // pre-event obs not continously observed in LEHD
tab flagmerge

*Remove cells where flag is on
bysort group firmid year sex_enc age_at_c tenure_cm5 source_at_cm5: egen temp = max(flagmerge)
replace flagmerge=temp if flagmerge==0
tab flagmerge
drop if flagmerge==1
drop flagmerge temp
tab _merge
drop _merge

*Fill in first_firmid to define the D's
bysort group firmid: egen temp = max(enc_first_firmid) 
replace enc_first_firmid = temp if enc_first_firmid==.
drop temp

*Drop if missing(enc_first_firmid): this happens for the same reason as _merge==1
drop if enc_first_firmid==.

*Fill in industry when missing
*2 digit
bysort group enc_first_firmid: egen temp = max(mnaics2_e)
replace mnaics2_e=temp if mnaics2_e==.
replace mnaics2_e=0 if mnaics2_e==. //equiv to missing
drop temp
*3 digit
bysort group enc_first_firmid: egen temp = max(mnaics3_e)
replace mnaics3_e=temp if mnaics3_e==.
replace mnaics3_e=0 if mnaics3_e==. //equiv to missing
drop temp

*Redefine the D's and treat_flag post-merge with firm-level data (CB + freeze = treat)
drop D* treat_flag k
gen k = year-group
bysort group enc_first_firmid: egen temp = max(yr_first_trt)
replace yr_first_trt = temp if yr_first_trt==.
drop temp

*First set everything to 0
forvalues k = 1/5{
	gen D_m`k'=0
}
gen D_0 = 0
forvalues k = 1/13{
	gen D_p`k'=0
}

*Update to 1 for the treated group 
sort group tenure_cm5 source_at_cm5 enc_first_firmid sex_enc age_at_c year 
replace D_0=1 if yr_first_trt==group & year==group
forvalues k=1/5{
	replace D_m`k'=1 if year+`k'==year[_n+`k'] & D_0[_n+`k']==1 & enc_first_firmid == enc_first_firmid[_n+`k']
}
forvalues k=1/13{
	replace D_p`k'=1 if year-`k'==year[_n-`k'] & D_0[_n-`k']==1 & enc_first_firmid == enc_first_firmid[_n-`k']

}
	
bysort group enc_first_firmid: egen treat_flag=max(D_0)

*Id for each cell
egen idvar = group(group enc_first_firmid sex_enc age_at_c tenure_cm5 source_at_cm5)

*Save file for analysis
save $r4data/wkerpanel_m, replace

*Clean up working files
erase $r4data/ee_panel_cell2.dta
erase $r4data/ee_panel_cell1.dta
